{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare Glove vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import codecs\n",
    "import pickle\n",
    "import operator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def loadGloveModel(gloveFile):\n",
    "    print \"Loading Glove Model\"\n",
    "    f = codecs.open(gloveFile,'r')\n",
    "    \n",
    "    model = {}\n",
    "    for line in f:\n",
    "        splitLine = line.split()\n",
    "        word = splitLine[0]\n",
    "        embedding = [float(val) for val in splitLine[1:]]\n",
    "        model[word] = embedding\n",
    "    print \"Done.\",len(model),\" words loaded!\"\n",
    "    return model\n",
    "\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cal_coverage(voca, glove):\n",
    "\n",
    "    cnt = 0\n",
    "    for token in voca.keys():\n",
    "        if glove.has_key( token ):\n",
    "            ;\n",
    "        else:\n",
    "            cnt = cnt + 1  \n",
    "\n",
    "    print '# missing token : ' + str(cnt)\n",
    "    print 'coverage : ' + str( 1 - ( cnt/ float(len(voca)) ) ) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_glove_embedding(voca, glove):\n",
    "\n",
    "    # sorting voca dic by value\n",
    "    sorted_voca = sorted(voca.items(), key=operator.itemgetter(1))\n",
    "    \n",
    "    list_glove_voca = []\n",
    "\n",
    "    cnt = 0\n",
    "\n",
    "    for token, value in sorted_voca:\n",
    "\n",
    "        if glove.has_key( token ):\n",
    "            list_glove_voca.append( glove[token] )\n",
    "        else:\n",
    "            if token == '_PAD_':\n",
    "                print 'add PAD as 0s'\n",
    "                list_glove_voca.append( np.zeros(300) )  \n",
    "            else:\n",
    "                list_glove_voca.append( np.random.uniform(-0.25, 0.25, 300).tolist() )\n",
    "                cnt = cnt + 1  \n",
    "\n",
    "    print 'coverage : ' + str( 1 - ( cnt/ float(len(voca)) ) )\n",
    "    return list_glove_voca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 01 load glove model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Glove Model\n",
      "Done. 2196016  words loaded!\n"
     ]
    }
   ],
   "source": [
    "glove = loadGloveModel('../data/raw/embedding/glove.840B.300d.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# glove_twit = loadGloveModel('../data/raw/embedding/glove.twitter.27B.200d.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 02-A process IEMOCAP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total dic size : 3747\n",
      "# missing token : 423\n",
      "coverage : 0.88710968775\n"
     ]
    }
   ],
   "source": [
    "dic = {}\n",
    "with open('../data/processed/IEMOCAP/dic.pkl') as f:\n",
    "    dic = pickle.load(f)\n",
    "print 'total dic size : ' + str(len(dic))\n",
    "\n",
    "cal_coverage(dic, glove)\n",
    "# cal_coverage(dic, glove_twit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "add PAD as 0s\n",
      "coverage : 0.887376567921\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "3747"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_glove_voca = create_glove_embedding(dic, glove)\n",
    "len(list_glove_voca)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3747, 300)\n"
     ]
    }
   ],
   "source": [
    "np_glove = np.asarray(list_glove_voca, dtype=np.float32)\n",
    "print np.shape(np_glove)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save('../data/processed/IEMOCAP/W_embedding.npy', np_glove)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 02-B process IEMOCAP (G speech api case)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dic = {}\n",
    "# with open('../data/processed/IEMOCAP/dic_G.pkl') as f:\n",
    "#     dic = pickle.load(f)\n",
    "# print 'total dic size : ' + str(len(dic))\n",
    "\n",
    "# cal_coverage(dic, glove)\n",
    "# # cal_coverage(dic, glove_twit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list_glove_voca = create_glove_embedding(dic, glove)\n",
    "# len(list_glove_voca)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# np_glove = np.asarray(list_glove_voca, dtype=np.float32)\n",
    "# print np.shape(np_glove)\n",
    "\n",
    "# np.save('../data/processed/IEMOCAP/W_embedding_G.npy', np_glove)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tf14_p27]",
   "language": "python",
   "name": "conda-env-tf14_p27-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
